Use scikit-learn's Random Forest implementation to perform a 10-fold cross-validation on imbalanced data, and benchmark training time over repeated runs using class-balancing sample weights.
In [1]:
def computeCV(data):
    # http://scikit-learn.org/dev/modules/classes.html#module-sklearn.cross_validation
    # Note: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
    # the same helpers now live in sklearn.model_selection.
    from sklearn import ensemble
    from sklearn.model_selection import cross_val_score
    clf = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=5, verbose=0)
    # http://randomforests.wordpress.com/2014/02/02/basics-of-k-fold-cross-validation-and-gridsearchcv-in-scikit-learn/
    # For a classifier, an integer cv uses StratifiedKFold, which preserves
    # the class proportions in every fold.
    res = cross_val_score(clf, data.data, data.target, cv=10, n_jobs=5)
    print(res)
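For imbalanced data it can also help to let the forest reweight classes itself rather than relying only on stratified folds. A minimal sketch, assuming a modern scikit-learn (class_weight='balanced' on RandomForestClassifier and the scoring parameter of cross_val_score both exist there):

from sklearn import datasets, ensemble
from sklearn.model_selection import cross_val_score

iris = datasets.load_iris()
clf = ensemble.RandomForestClassifier(
    n_estimators=1000,
    n_jobs=5,
    class_weight='balanced',  # weight classes inversely to their frequency
)
# macro-averaged F1 treats every class equally, which is more informative
# than plain accuracy when the classes are imbalanced
scores = cross_val_score(clf, iris.data, iris.target, cv=10, scoring='f1_macro')
print(scores.mean())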
def dtime_to_seconds(dtime):
    # convert a datetime.timedelta to float seconds (ignores the days field)
    return dtime.seconds + (dtime.microseconds * 1e-6)
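Note that Python's timedelta already ships an equivalent (and days-aware) conversion, so the helper above could be replaced with it; a one-line check:

from datetime import timedelta

# timedelta.total_seconds() accounts for days, seconds, and microseconds
assert timedelta(seconds=2, microseconds=500000).total_seconds() == 2.5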
def bench(func, data, n=10):
    # run func n times, drop the fastest and slowest run, and return
    # the last score together with the remaining timings
    assert n > 2
    score = np.inf
    try:
        time = []
        for i in range(n):
            score, t = func(*data)
            time.append(dtime_to_seconds(t))
        # remove extremal values
        time.pop(np.argmax(time))
        time.pop(np.argmin(time))
    except Exception as detail:
        print('%s error in function %s' % (repr(detail), func))
        time = []
    return score, np.array(time)
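As a quick sanity check, bench can be driven with any callable that returns a (score, timedelta) pair; the toy function below is hypothetical and only for illustration:

import numpy as np
from datetime import datetime

def toy(x):
    # pretend work: sum an array and time it
    start = datetime.now()
    s = np.sum(x)
    return s, datetime.now() - start

score, times = bench(toy, (np.arange(1000),), n=5)
print(score, times)  # 5 runs minus the two extremes -> 3 timings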
def bench_skl(X, y, T, valid):
    from sklearn import ensemble #, linear_model
    #from sklearn.utils import safe_asarray
    start = datetime.now()
    # balance the dataset
    # https://github.com/scikit-learn/scikit-learn/blob/8dab222cfe894126dfb67832da2f4e871b87bce7/sklearn/preprocessing/_weights.py
    # map the labels to 0..n_classes-1
    y = np.searchsorted(np.unique(y), y)
    class_weight_bins = np.bincount(y)
    # from class weights to sample weights: each sample is weighted inversely
    # to the size of its class, scaled so samples of the smallest class get weight 1
    sample_weights = 1. / class_weight_bins.take(y)
    sample_weights *= class_weight_bins.min()
    # http://scikit-learn.org/stable/modules/classes.html
    clf = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=5, verbose=0)
    #clf = linear_model.ElasticNet(alpha=0.5, l1_ratio=0.5)
    #clf = linear_model.LogisticRegression()
    #clf = neighbors.NeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute_inplace')
    #clf = skl_cluster.KMeans(k=n_components, n_init=1)
    #...
    clf.fit(X, y, sample_weight=sample_weights)
    ## Regression
    # pred = clf.predict(T)
    # delta = datetime.now() - start
    # mse = np.linalg.norm(pred - valid, 2) ** 2
    # return mse, delta
    # Classification
    score = np.mean(clf.predict(T) == valid)
    return score, datetime.now() - start
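The two sample-weight lines are easy to verify by hand on a toy label vector; with y = [0, 0, 0, 0, 1, 1] the majority class ends up down-weighted so that both classes contribute equal total weight:

import numpy as np

y = np.array([0, 0, 0, 0, 1, 1])
bins = np.bincount(y)     # [4, 2]
w = 1. / bins.take(y)     # [0.25, 0.25, 0.25, 0.25, 0.5, 0.5]
w *= bins.min()           # [0.5, 0.5, 0.5, 0.5, 1.0, 1.0]
# each class now carries a total weight of 2.0
print(w, w[y == 0].sum(), w[y == 1].sum())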
def computeAverageFromNRuns(data, num_tries, TH):
    # randomly send samples with draw >= TH to the training set and the
    # rest to the test set (TH=0.9 keeps roughly 10% for training)
    sample_range = np.random.random_sample(size=data.target.shape[0])
    X = np.array([data.data[i] for i in range(len(data.target)) if sample_range[i] >= TH])
    Y = np.array([data.target[i] for i in range(len(data.target)) if sample_range[i] >= TH])
    T = np.array([data.data[i] for i in range(len(data.target)) if sample_range[i] < TH])
    valid = np.array([data.target[i] for i in range(len(data.target)) if sample_range[i] < TH])
    #X, T, Y, valid = train_test_split(data.data, data.target, test_size=0.9, random_state=0)
    score, times = bench(bench_skl, (X, Y, T, valid), num_tries)
    print('Tries:', num_tries, 'Score:', score, 'Time:', np.mean(times), '(mean)', np.median(times), '(median)')
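The commented-out train_test_split line is the idiomatic way to do this split; a minimal sketch, assuming sklearn.model_selection, where the stratify argument keeps the class proportions of an imbalanced target intact in both halves:

from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
# 10% train / 90% test, class proportions preserved in both parts
X, T, Y, valid = train_test_split(
    iris.data, iris.target,
    test_size=0.9, random_state=0,
    stratify=iris.target,
)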
from sklearn import datasets
import numpy as np
from datetime import datetime
#from sklearn import model_selection
iris = datasets.load_iris()
computeCV(iris)
runs = 25
TH = 0.9
computeAverageFromNRuns(iris, runs, TH)